Summary

This script takes the raw data downloaded from Crimson Hexagon and cleans it up for analysis. At the bottom of the script there are a few exploratory maps.

Setup

# Suppress messages and warnings in rendered output.
# Use TRUE/FALSE, not T/F: T and F are ordinary variables and can be reassigned.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

library(tidyverse)
library(jsonlite)
library(ggmap)
library(leaflet)
library(sf)
library(readxl)
library(reticulate)
library(RColorBrewer)
library(kableExtra)
library(mapview)

Data cleaning

Crimson Hexagon data is saved in two day bulk exports. The CH website only allows exports of 10,000 randomly selected tweets. There seemed to be between 10-15k over any 2 day period so data was exported in 2-day chunks to try and get as much data as possible. Two filters were applied to the data before downloading - the location was set to Santa Barbara (this does not mean the tweet was geotagged but that it came from the area) and that it was an Original Tweet (not a retweet).

# List all .xlsx exports. Anchor the pattern (escape the dot, anchor at end)
# so files like "~$foo.xlsx" backups with other extensions containing "xlsx"
# are not accidentally matched.
xl_files <- list.files("../data/daily", pattern = "\\.xlsx$", full.names = TRUE)

# Read the GUID (tweet id) column from every Crimson Hexagon export and stack
# them into one table. Reading into a list and binding once avoids the
# O(n^2) rbind-in-a-loop growth pattern of the original.
ids <- xl_files %>%
  lapply(function(path) {
    read_excel(path, skip = 1) %>%  # first row of CH exports is a title row
      select(GUID)
  }) %>%
  bind_rows()

# Split the ids into 29 roughly equal, NON-overlapping chunks and write one
# plain-text file per chunk for the python twarc library to hydrate.
# NOTE(review): the original used seq(length.out = 30) boundaries, which
# reused the end index of each chunk as the start of the next, so boundary
# ids were written to two files.
chunks <- split(ids$GUID, cut(seq_len(nrow(ids)), breaks = 29, labels = FALSE))

for (i in seq_along(chunks)) {
  # Write the ids as full-precision text, NOT via as.numeric(): 64-bit
  # Twitter ids exceed 2^53 and are corrupted when round-tripped through
  # doubles. (If GUID was already read as numeric upstream, precision may
  # already be lost there - TODO confirm the column type from read_excel.)
  write.table(format(chunks[[i]], scientific = FALSE, trim = TRUE),
              file = paste0("../data/twitter_ids_", i, ".txt"), sep = "\t",
              row.names = FALSE, col.names = FALSE)
}

Now I use the python library, twarc in my terminal to “hydrate” the data using the tweet IDs. The Crimson Hexagon data does not give us much information but the twarc library lets us use the twitter id to grab a lot more information (including coordinates for geotagged tweets).

Once this is done, all tweets are saved in a JSON file.

# Read each hydrated JSON-lines file produced by twarc into its own data
# frame. Downstream code refers to the objects by name (tweets1 ... tweets29),
# so assign() is used to keep those bindings rather than building a list.
for (i in seq_len(29)) {
  assign(paste0("tweets", i),
         stream_in(file(paste0("../data/tweets", i, ".jsonl"))))
}
create_tweet_df <- function(tweets) {
  # Flatten the nested twarc JSON into a tidy tibble with only the columns
  # used downstream, then keep just the geotagged tweets.
  #
  # The original built the table with cbind(), which coerces every column to
  # character and silently discards the as.numeric() wrappers; constructing
  # the tibble directly gives each column its intended type in one pass.
  #
  # NOTE(review): tweet/user ids are kept numeric to match the rest of the
  # pipeline, but 64-bit Twitter ids exceed 2^53 and lose precision as
  # doubles - retain id_str if exact ids are ever needed.
  tweet_df <- tibble(
    created_at      = as.character(tweets$created_at),
    tweet_id        = as.numeric(tweets$id_str),
    full_text       = as.character(tweets$full_text),
    user_id         = as.numeric(tweets$user$id_str),
    user_location   = as.character(tweets$user$location),
    geo_type        = as.character(tweets$geo$type),
    # list-column of c(lat, lon) pairs deparsed to "c(34.4, -119.7)" strings;
    # these are parsed back out with gsub()/separate() later in the script
    geo_coordinates = as.character(tweets$geo$coordinates),
    language        = as.character(tweets$lang),
    retweet_count   = as.numeric(tweets$retweet_count),
    favorite_count  = as.numeric(tweets$favorite_count)
  )

  # geo_type is NA for tweets without a geotag; drop those.
  tweet_df %>%
    filter(!is.na(geo_type))
}

Apply function

# Apply create_tweet_df() to each hydrated chunk. The df1 ... df29 names are
# kept (via assign/get) because the combining step below refers to them.
for (i in seq_len(29)) {
  assign(paste0("df", i), create_tweet_df(get(paste0("tweets", i))))
}

Combine

# Stack the 29 per-chunk data frames into a single table.
all_df <- bind_rows(mget(paste0("df", seq_len(29))))

Remove points outside of our study area. The nominal bounding box is c(-119.9, 34.38, -119.5, 34.48), but the filter below deliberately keeps a somewhat wider range (lat 33.88–34.6, lon -120.5 to -119.5) before any finer clipping.

# create new df with just the tweet texts & usernames
# Parse the deparsed "c(lat, lon)" coordinate strings into numeric columns,
# clip to the study area, and split Twitter's fixed-width created_at string
# (e.g. "Wed Mar 18 21:02:57 +0000 2018") into date parts by character
# position. NOTE(review): every positional separate() below assumes the
# canonical 30-character created_at layout - verify if the source changes.
tweet_data <- all_df %>%
    mutate(coords = gsub("\\)|c\\(", "", geo_coordinates)) %>%  # strip "c(" and ")"
    separate(coords, c("lat", "lon"), sep = ", ") %>%           # "lat, lon" -> two columns
    mutate_at(c("lon", "lat"), as.numeric) %>%
   filter(lat >=33.88 & lat <= 34.6,
          lon <= -119.5 & lon >= -120.5) %>%
  # chars 1-26 = "Wed Mar 18 21:02:57 +0000 ", chars 27+ = "2018"
  separate(created_at, into = c("Day", "Year"), sep = 26) %>%
  mutate(Year = as.numeric(Year)) %>%
  # peel off the weekday ("Wed "), then "Mar 18 " (Date), then "21:02:57 " (Time)
  separate(Day, into = c("Day", "Date"), sep = 4) %>%
  separate(Date, into = c("Date", "Time"), sep = 7) %>%
  separate(Time, into = c("Time", "Extra"), sep = 9) %>%        # Extra = "+0000 " offset
  select(-Extra, -language, -geo_type, -Day) %>%
  separate(Date, into = c("Month", "Day"), sep = " ") %>%
  mutate(Day = as.numeric(Day)) %>%
  mutate(month_num = match(Month,month.abb)) %>%                # "Mar" -> 3
  mutate(date = as.Date(paste0(month_num, "/", Day, "/",Year), tryFormats = "%m/%d/%Y"))

write_csv(tweet_data, "../data/geotagged_sb_tweets.csv")

#remove tweets from Jan-Apr 2015 because of the Twitter user interface change
tweet_data_sub <- tweet_data %>% filter(date > "2015-04-28")
write_csv(tweet_data_sub, "../data/geotagged_sb_tweets_post_apr_2015.csv") 

Map tweets

Map of all tweets

Turn the tweet_df_w_user_type data frame into a spatial object.

# Reload the post-Apr-2015 geotagged tweets and convert them to an sf
# point layer in WGS84 (EPSG:4326).
tweet_data <- read_csv("../data/geotagged_sb_tweets_post_apr_2015.csv")

tweet_sf <- st_as_sf(tweet_data, coords = c("lon", "lat"), crs = 4326)

Interactive with cluster markers

# Interactive map: clustered circle markers over a light basemap, with each
# tweet's text shown as a popup on click. The data is supplied once to
# leaflet() - the original repeated `data = tweet_data` in addCircleMarkers().
map <- leaflet(tweet_data) %>%
  # Base groups
  addProviderTiles(providers$CartoDB.Positron) %>%
  # Overlay groups
  addCircleMarkers(lng = ~lon, lat = ~lat, popup = ~full_text,
                   radius = 3, stroke = FALSE, fillOpacity = 0.5,
                   clusterOptions = markerClusterOptions())
map

Static tweet map

# Google Maps API key is read from the environment; required by get_map().
register_google(Sys.getenv("GOOGLE_ACCESS_TOKEN"))

# Santa Barbara basemap (zoom 14 covers roughly downtown).
sb.map <- get_map("santa barbara, california", zoom = 14, maptype = "toner-lite")

# One semi-transparent point per tweet over the basemap.
# The original set labs(fill = "User type") with no fill aesthetic mapped,
# so that label was inert and has been dropped; the two labs() calls are
# merged into one.
ggmap(sb.map, legend = "none") +
  coord_equal() +
  theme(axis.text = element_blank()) +
  geom_point(data = tweet_data, aes(x = lon, y = lat),
             size = 0.55, alpha = 0.2, color = "darkorchid4") +
  labs(x = NULL, y = NULL,
       title = "Tweets in downtown Santa Barbara")

ggsave("../figs/all_tweets_sb_downtown.png")

Static map of downtown

# One OrRd shade per count bin (drop the palest shade for legibility).
cols <- brewer.pal(9, "OrRd")[2:9]

# Hex-binned tweet density over the downtown basemap. Counts are cut into
# manual bins so the legend stays readable despite the heavy skew.
# after_stat(count) replaces the deprecated ..count.. notation; the legend
# labels had inconsistent trailing spaces, normalized here.
ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(after_stat(count),
                          c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf))),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5", "5-9", "10-49", "50-99",
               "100-499", "500-999", "1000-2499", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in Santa Barbara 2015-2019")

ggsave("../figs/all_tweets_sb_static_hex_map.png")

Static map of whole area

# Wider view: re-fetch the basemap at zoom 11 to cover the greater SB area.
sb.map <- get_map("santa barbara, california", zoom = 11, maptype = "toner-lite")

# Same hex-density styling as the downtown map; after_stat(count) replaces
# the deprecated ..count.. notation and legend label spacing is normalized.
ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(after_stat(count),
                          c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf))),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5", "5-9", "10-49", "50-99",
               "100-499", "500-999", "1000-2499", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in larger SB area 2015-2019")

Interactive hex density

Get hex density by overlaying with points

# Hexagon grid for the SB area, reprojected to match the tweet layer's CRS.
hex_grid <- read_sf("../data/sb_area_hexagons.shp") %>%
  st_transform(st_crs(tweet_sf))

# Count tweets per hexagon: st_intersects() returns, for each hex, the
# indices of the intersecting points, and lengths() turns that into a count.
hex_tweet_count <- hex_grid %>%
  mutate(tweet_count = lengths(st_intersects(hex_grid, tweet_sf)))

# Interactive choropleth of the non-empty hexes.
mapview(filter(hex_tweet_count, tweet_count > 0),
        zcol = "tweet_count", layer.name = "# tweets")

Why are there so many tweets near De La Vina and Arrellaga hospital? Let’s take a closer look at tweets by geo_coordinates

# Tally tweets per exact coordinate string, most frequent first.
geo_tweets <- tweet_data %>%
  count(geo_coordinates, name = "count", sort = TRUE)

head(geo_tweets)
## # A tibble: 6 x 2
##   geo_coordinates               count
##   <chr>                         <int>
## 1 c(34.4258, -119.714)          11572
## 2 c(34.42, -119.7)               2062
## 3 c(34.4337, -119.632)            913
## 4 c(34.39916667, -119.51638889)   704
## 5 c(34.41938, -119.69905)         666
## 6 c(34.4405, -119.814)            511

So one coordinate has 11,572 tweets from it. The next highest is just 2,062 tweets.

# Zoom tightly on the hotspot coordinate to see what is at that location.
sb.zoom.map <- get_map(location = c(-119.7158247, 34.4262342), zoom = 17, maptype = "toner-lite")

ggmap(sb.zoom.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  # state the aesthetics explicitly instead of relying on the default
  # mapping inherited from ggmap()
  geom_hex(data = tweet_data, aes(x = lon, y = lat), bins = 50)

The light blue point is at the coordinates c(34.4258, -119.714). I think this is the default coordinate assigned when someone tags "Santa Barbara" without a precise location. The first clue is that there is nothing of significance at this location; it is a residential area. Let’s take a look at a handful of tweets coming from here.

# Pull every tweet posted from the suspected "default Santa Barbara" point.
delavina_tweets <- filter(tweet_data, geo_coordinates == "c(34.4258, -119.714)")

# Show a random sample of ten in a compact styled table.
delavina_tweets %>%
  sample_n(10) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed"),
                font_size = 10, fixed_thead = TRUE)
Month Day Time Year full_text user_id user_location geo_coordinates retweet_count favorite_count lat lon month_num date
Mar 18 21:02:57 2018 J-R Marathon &amp; Mystical Traveler screening in Santa Barbara, Ca send light to Montecito "For… https://t.co/in4u7fWCeP 1.554213e+07 Santa Monica c(34.4258, -119.714) 0 0 34.4258 -119.714 3 2018-03-18
Nov 22 18:07:29 2017 Anyone else’s mom ALSO their #bestie?! ❤️ Cannot wait for Elizabeth Leibovitz to arrive this… https://t.co/J8KoX5JUVL 2.435284e+09 Denton, TX c(34.4258, -119.714) 0 0 34.4258 -119.714 11 2017-11-22
Jan 11 21:33:06 2016 DECCO Champagne Cage chair by Stabiles with Airplant Garden and Lamp. My Official Entry pic😎… https://t.co/7fLn7je3fc 1.668713e+09 Santa Barbara, California c(34.4258, -119.714) 0 0 34.4258 -119.714 1 2016-01-11
Jan 31 17:28:13 2018 One more from the weekend. @ Santa Barbara, California https://t.co/2GIqbdMqWp 1.671418e+07 California c(34.4258, -119.714) 0 4 34.4258 -119.714 1 2018-01-31
Jan 27 02:56:47 2018 Don’t ever give up on your dreams. You can do it! #followyourdreams #jacuzzi… https://t.co/V3zslaYCDq 8.483431e+17 Guarding the Universe c(34.4258, -119.714) 0 0 34.4258 -119.714 1 2018-01-27
Aug 3 20:45:35 2018 Wine tasting…living my best life. 🍷🍷🍷 . . . . . #wine #winelover #santabarbara #roadtrip #saturday #weekend #fbf #flashbackfriday #friends #girlfriends… https://t.co/mchy8fwVat 2.042687e+07 Los Angeles c(34.4258, -119.714) 2 0 34.4258 -119.714 8 2018-08-03
Jan 1 09:24:07 2016 Because kicking off 2016 dancing on a bench is clearly the best way to start the new year! #nye… https://t.co/dQhJtc3ko6 2.807708e+07 California c(34.4258, -119.714) 0 0 34.4258 -119.714 1 2016-01-01
Jul 14 03:04:23 2016 One of those dresses that makes you feel like💃🏼 privacypls | https://t.co/fUflBTb1Ohhttps://t.co/5bpbYjfsG2 3.383952e+09 Los Angeles, CA c(34.4258, -119.714) 0 0 34.4258 -119.714 7 2016-07-14
Apr 8 15:51:14 2019 Just posted a photo @ Santa Barbara, California https://t.co/rcdLHkGz8o 1.495026e+07 San Jose, Berkeley, LA 🗼🌉🏙 c(34.4258, -119.714) 0 0 34.4258 -119.714 4 2019-04-08
Jul 29 00:52:59 2018 🗓My Days JULY/28/2018🗓 🗝" #LOVEAF "🗝 #IAlone #ILoveYouALL #Analytics #NEVERQuitting #CapturedByCAL ____________________________________ #RecordingArtist #LifestyleBlogger #BrandAmbassador… https://t.co/YBakT9K4dz 2.243870e+08 Cerritos, CA c(34.4258, -119.714) 0 0 34.4258 -119.714 7 2018-07-29

In mid-2019, Twitter removed the ability to precisely identify your location in a tweet. I want to see the time frame for most of these tweets. If the majority are in the later half of 2019, it might be worth it to remove those tweets…

# Daily tweet counts at the default coordinate, to see how the volume of
# these tweets evolves over the study period.
check_geo <- tweet_data %>%
  filter(geo_coordinates == "c(34.4258, -119.714)") %>%
  count(date, name = "count")

# Time series with a smoothed trend line overlaid.
ggplot(check_geo, aes(x = date, y = count)) +
  geom_smooth() +
  geom_line()

It doesn’t look like there is a significant dropoff in 2019; in fact, there are so few at the beginning of the time series that I wonder if the default coordinate was only implemented later on.

Look at the log of tweet count.

# Recompute per-hex counts plus a log-scaled version for display.
# NOTE(review): hexes with zero tweets get log(0) = -Inf here; they are
# filtered out before mapping below, but guard against this if the column
# is ever reused. The `bin` column is computed but not used by what follows.
hex_tweet_count <- hex_grid %>%
  mutate(tweet_count = lengths(st_intersects(hex_grid, tweet_sf)),
         log_tweet_count = log(tweet_count),
         # case_when() evaluates top-down, so each lower bound is implied
         # by the previous condition having failed - no need to restate it
         bin = case_when(
           tweet_count < 10 ~ 10,
           tweet_count < 50 ~ 50,
           tweet_count < 100 ~ 100,
           tweet_count < 500 ~ 500,
           tweet_count < 1000 ~ 1000,
           tweet_count < 2000 ~ 2000,
           TRUE ~ 2001
         ))

# Interactive map of log tweet counts; empty hexes are dropped so the
# log(0) = -Inf values never reach the renderer.
log_hex_map <- hex_tweet_count %>%
  filter(tweet_count > 0) %>%
  mapview(zcol = "log_tweet_count", layer.name = "Tweet count (log)")

log_hex_map